{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install promise tensorflow-datasets dill future tensorflow-metadata\n",
    "# !pip3 install tensorflow-datasets tf-sentencepiece sentencepiece tensorflow-text==1.15 tfds-nightly --no-deps\n",
    "# !pip3 install t5[gcp]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# !pip3 install tensorflow==1.15 tensorflow-text==1.15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.5.0'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "t5.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'grpc://10.76.157.10:8470'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpu = tf.distribute.cluster_resolver.TPUClusterResolver('node-1', 'us-central1-a', 'mesolitica-cloud')\n",
    "TPU_ADDRESS = tpu.get_master()\n",
    "TPU_TOPOLOGY = '2x2'\n",
    "TPU_ADDRESS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = 'gs://mesolitica-general/t5-vocab/sp10m.cased.t5.model'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dumping_dataset(split, shuffle_files=False):\n",
    "    del shuffle_files\n",
    "    ds = tf.data.TextLineDataset(['gs://mesolitica-general/t5-data/dumping-iium.tsv'])\n",
    "\n",
    "    ds = ds.map(\n",
    "      functools.partial(tf.io.decode_csv, record_defaults=[\"\", \"\"],\n",
    "                        field_delim=\"\\t\", use_quote_delim=False),\n",
    "      num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
    "    ds = ds.map(lambda *ex: dict(zip([\"title\", \"text\"], ex)))\n",
    "    return ds\n",
    "\n",
    "t5.data.TaskRegistry.remove('dumping_txt')\n",
    "t5.data.TaskRegistry.add(\n",
    "    \"dumping_txt\",\n",
    "    dataset_fn=dumping_dataset,\n",
    "    splits=[\"train\"],\n",
    "    text_preprocessor=functools.partial(\n",
    "        t5.data.preprocessors.rekey, key_map={\"inputs\": None, \"targets\": \"text\"}),\n",
    "    token_preprocessor=t5.data.preprocessors.unsupervised,\n",
    "    sentencepiece_model_path=vocab,\n",
    "    metric_fns=[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# nq_task = t5.data.TaskRegistry.get(\"dumping_txt\")\n",
    "# ds = nq_task.get_dataset(split=\"train\", sequence_length={\"inputs\": 1024, \"targets\": 32})\n",
    "# for ex in tfds.as_numpy(ds.take(5)):\n",
    "#     print(ex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "t5.data.MixtureRegistry.remove(\"trivia_all\")\n",
    "t5.data.MixtureRegistry.add(\n",
    "    \"trivia_all\",\n",
    "    ['dumping_txt'],\n",
    "     default_rate=1.0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gin\n",
    "\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_SIZE = 'base'\n",
    "model_parallelism, train_batch_size, keep_checkpoint_max = {\n",
    "    \"small\": (1, 256, 16),\n",
    "    \"base\": (2, 128, 8),\n",
    "    \"large\": (8, 64, 4),\n",
    "    \"3B\": (8, 16, 1),\n",
    "    \"11B\": (8, 16, 1)}['base']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = t5.models.MtfModel(\n",
    "    model_dir='gs://mesolitica-general/t5-base/',\n",
    "    tpu=TPU_ADDRESS,\n",
    "    tpu_topology=TPU_TOPOLOGY,\n",
    "    model_parallelism=model_parallelism,    \n",
    "    batch_size=train_batch_size,\n",
    "    sequence_length={\"inputs\": 1024, \"targets\": 1024},\n",
    "    learning_rate_schedule=0.003,\n",
    "    save_checkpoints_steps=5000,\n",
    "    iterations_per_loop=100,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using config: {'_model_dir': 'gs://mesolitica-general/t5-base/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n",
      "cluster_def {\n",
      "  job {\n",
      "    name: \"worker\"\n",
      "    tasks {\n",
      "      key: 0\n",
      "      value: \"10.76.157.10:8470\"\n",
      "    }\n",
      "  }\n",
      "}\n",
      "isolate_session_state: true\n",
      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb67f2ac080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.76.157.10:8470', '_evaluation_master': 'grpc://10.76.157.10:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1), '_cluster': <tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver.TPUClusterResolver object at 0x7fb67f28beb8>}\n",
      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
      "INFO:tensorflow:Querying Tensorflow master (grpc://10.76.157.10:8470) for TPU system metadata.\n",
      "INFO:tensorflow:Initializing TPU system (master: grpc://10.76.157.10:8470) to fetch topology for model parallelism. This might take a while.\n",
      "INFO:tensorflow:Found TPU system:\n",
      "INFO:tensorflow:*** Num TPU Cores: 8\n",
      "INFO:tensorflow:*** Num TPU Workers: 1\n",
      "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 9607701809975203229)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 18078171525073511535)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 9941243816703733777)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 11191119512901994206)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 10949086066794026809)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 5966843802477955755)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 1682211469179347769)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 7370203009853248049)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 17179869184, 11286708254391041496)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 17179869184, 15009114820372484948)\n",
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 6685706832561661269)\n",
      "WARNING:tensorflow:From /home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "If using Keras pass *_constraint arguments to layers.\n",
      "WARNING:tensorflow:From /home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.\n",
      "INFO:tensorflow:Calling model_fn.\n",
      "WARNING:tensorflow:From /home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py:1475: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.where in 2.0, which has the same broadcast rule as np.where\n",
      "INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 \n",
      "INFO:tensorflow:enable_2d_tiling: False\n",
      "INFO:tensorflow:num_cores_per_replica: 1\n",
      "INFO:tensorflow:computation_shape: [1, 1, 1]\n",
      "INFO:tensorflow:num_replicas: 8\n",
      "INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0]\n",
      "  [0 0 1]\n",
      "  [1 0 0]\n",
      "  [1 0 1]\n",
      "  [0 1 0]\n",
      "  [0 1 1]\n",
      "  [1 1 0]\n",
      "  [1 1 1]]]\n",
      "INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0]]\n",
      "\n",
      " [[0 0 1]]\n",
      "\n",
      " [[0 1 0]]\n",
      "\n",
      " [[0 1 1]]\n",
      "\n",
      " [[1 0 0]]\n",
      "\n",
      " [[1 0 1]]\n",
      "\n",
      " [[1 1 0]]\n",
      "\n",
      " [[1 1 1]]]\n",
      "WARNING:tensorflow:SimdMeshImpl ignoring devices ['', '', '', '', '', '', '', '']\n",
      "INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('vocab', 'model'), ('batch', 'batch'), ('d_ff', 'model'), ('ensemble', 'ensemble'), ('experts', 'batch'), ('heads', 'model')}\n",
      "INFO:tensorflow:Device Assignment: <tensorflow.python.tpu.device_assignment.DeviceAssignment object at 0x7fb6646048d0>\n",
      "INFO:tensorflow:serialize_num_microbatches: tokens_per_microbatch_per_replica=8192 batch_dim=Dimension(name='batch', size=128) sequence_length={'inputs': 1024, 'targets': 1024} batch_per_replica=32 num_microbatches=4\n",
      "WARNING:tensorflow:Using default tf glorot_uniform_initializer for variable encoder/block_000/layer_000/SelfAttention/relative_attention_bias  The initialzer will guess the input and output dimensions  based on dimension order.\n",
      "WARNING:tensorflow:Using default tf glorot_uniform_initializer for variable decoder/block_000/layer_000/SelfAttention/relative_attention_bias  The initialzer will guess the input and output dimensions  based on dimension order.\n",
      "INFO:tensorflow:Create pnum_tensor\n",
      "INFO:tensorflow:Casting <dtype: 'int32'> to float32 for allreduce\n",
      "INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_000/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_001/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_002/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_003/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_004/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_005/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_006/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_007/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_008/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_009/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_010/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_001/EncDecAttention/k                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_001/EncDecAttention/o                size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_001/EncDecAttention/q                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_001/EncDecAttention/v                size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_002/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable decoder/block_011/layer_002/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_000/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_000/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_000/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_000/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_000/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_000/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_001/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_001/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_001/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_001/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_001/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_001/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_002/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_002/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_002/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_002/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_002/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_002/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_003/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_003/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_003/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_003/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_003/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_003/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_004/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_004/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_004/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_004/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_004/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_004/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_005/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_005/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_005/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_005/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_005/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_005/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_006/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_006/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_006/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_006/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_006/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_006/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_007/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_007/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_007/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_007/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_007/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_007/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_008/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_008/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_008/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_008/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_008/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_008/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_009/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_009/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_009/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_009/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_009/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_009/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_010/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_010/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_010/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_010/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_010/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_010/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_011/layer_000/SelfAttention/k                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_011/layer_000/SelfAttention/o                  size 589824       slice_size 294912       Shape[heads=768, d_model=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_011/layer_000/SelfAttention/q                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_011/layer_000/SelfAttention/v                  size 589824       slice_size 294912       Shape[d_model=768, heads=768]                               \n",
      "INFO:tensorflow:Variable encoder/block_011/layer_001/DenseReluDense/wi/kernel         size 2359296      slice_size 1179648      Shape[d_model=768, d_ff=3072]                               \n",
      "INFO:tensorflow:Variable encoder/block_011/layer_001/DenseReluDense/wo/kernel         size 2359296      slice_size 1179648      Shape[d_ff=3072, d_model=768]                               \n",
      "INFO:tensorflow:Variable shared/embedding                                             size 24674304     slice_size 12337152     Shape[vocab=32128, d_model=768]                             \n",
      "INFO:tensorflow:Variable stacked/encoder/block_000/layer_000/SelfAttention/relative_attention_bias size 768          slice_size 384          Shape[stacked=2, heads=12, buckets=32]                      \n",
      "INFO:tensorflow:    encoder/block_000/layer_000/SelfAttention/relative_attention_bias\n",
      "INFO:tensorflow:    decoder/block_000/layer_000/SelfAttention/relative_attention_bias\n",
      "INFO:tensorflow:Variable stacked/encoder/block_000/layer_000/layer_norm/scale         size 47616        slice_size 47616        Shape[stacked=62, d_model=768]                              \n",
      "INFO:tensorflow:    encoder/block_000/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_000/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_001/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_001/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_002/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_002/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_003/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_003/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_004/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_004/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_005/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_005/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_006/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_006/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_007/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_007/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_008/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_008/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_009/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_009/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_010/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_010/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_011/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/block_011/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    encoder/final_layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_000/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_000/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_000/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_001/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_001/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_001/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_002/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_002/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_002/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_003/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_003/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_003/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_004/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_004/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_004/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_005/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_005/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_005/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_006/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_006/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_006/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_007/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_007/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_007/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_008/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_008/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_008/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_009/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_009/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_009/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_010/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_010/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_010/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_011/layer_000/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_011/layer_001/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/block_011/layer_002/layer_norm/scale\n",
      "INFO:tensorflow:    decoder/final_layer_norm/scale\n",
      "INFO:tensorflow:Trainable Variables            count: 195     Total size: 222903552        Total slice_size: 111475584      \n",
      "INFO:tensorflow:All Variables                  count: 203     Total size: 223390336        Total slice_size: 111816896      \n",
      "INFO:tensorflow:Counters:\n",
      "allconcat: 1.05e+06\n",
      " allconcat/0: 1.05e+06\n",
      "  allconcat/0/reshape_op: 1.05e+06\n",
      "allreduce: 8.34e+09\n",
      " allreduce/[0]: 2.1e+09\n",
      "  allreduce/[0]/einsum_op: 8.92e+08\n",
      "  allreduce/[0]/reduce_op: 1.21e+09\n",
      " allreduce/[1]: 6.24e+09\n",
      "  allreduce/[1]/einsum_op: 6.24e+09\n",
      "  allreduce/[1]/reduce_op: 1.45e+06\n",
      "einsum: 3.26e+13\n",
      "einsum_unique: 3.25e+13\n",
      "output: 1.07e+10\n",
      " output/AddOperation: 2.73e+06\n",
      " output/Constant: 8\n",
      " output/EinsumOperation: 3.57e+09\n",
      " output/ImportOperation: 6.3e+06\n",
      " output/MinMaxOperation: 4.11e+03\n",
      " output/ReduceOperation: 2.35e+06\n",
      " output/ReshapeOperation: 2.62e+06\n",
      " output/ScalarAddOperation: 8.92e+08\n",
      " output/ScalarMultiplyOperation: 7.82e+06\n",
      " output/SlicewiseOperation: 4.46e+09\n",
      " output/StackOperation: 2.99e+06\n",
      " output/StackedVariable: 2.99e+06\n",
      " output/UnstackOperation: 2.99e+06\n",
      " output/Variable: 8.92e+08\n",
      " output/WhileLoopOperation: 8.92e+08\n",
      "output_unique: 2.68e+09\n",
      " output_unique/AddOperation: 4.87e+05\n",
      " output_unique/Constant: 1\n",
      " output_unique/EinsumOperation: 8.92e+08\n",
      " output_unique/ImportOperation: 7.87e+05\n",
      " output_unique/MinMaxOperation: 514\n",
      " output_unique/ReduceOperation: 4.39e+05\n",
      " output_unique/ReshapeOperation: 9.18e+05\n",
      " output_unique/ScalarAddOperation: 2.23e+08\n",
      " output_unique/ScalarMultiplyOperation: 1.41e+06\n",
      " output_unique/SlicewiseOperation: 1.12e+09\n",
      " output_unique/StackOperation: 5.03e+05\n",
      " output_unique/StackedVariable: 5.03e+05\n",
      " output_unique/UnstackOperation: 5.03e+05\n",
      " output_unique/Variable: 2.23e+08\n",
      " output_unique/WhileLoopOperation: 2.23e+08\n",
      "variables: 2.23e+08\n",
      " variables/trainable: 2.23e+08\n",
      " variables/untrainable: 4.87e+05\n",
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Done calling model_fn.\n",
      "INFO:tensorflow:TPU job name worker\n",
      "INFO:tensorflow:Starting the session.\n",
      "INFO:tensorflow:Graph was finalized.\n",
      "INFO:tensorflow:Running local_init_op.\n",
      "INFO:tensorflow:Done running local_init_op.\n",
      "WARNING:tensorflow:From /home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py:751: Variable.load (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Prefer Variable.assign which has equivalent behavior in 2.X.\n",
      "INFO:tensorflow:Initialized dataset iterators in 1 seconds\n",
      "INFO:tensorflow:Installing graceful shutdown hook.\n",
      "INFO:tensorflow:Creating heartbeat manager for ['/job:worker/replica:0/task:0/device:CPU:0']\n",
      "INFO:tensorflow:Configuring worker heartbeat: shutdown_mode: WAIT_FOR_COORDINATOR\n",
      "\n",
      "INFO:tensorflow:Starting infeed thread controller.\n",
      "INFO:tensorflow:Starting outfeed thread controller.\n",
      "INFO:tensorflow:Before copy master to slices.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception in thread Thread-5:\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_core/python/distribute/cluster_resolver/tpu_cluster_resolver.py\", line 476, in _fetch_cloud_tpu_metadata\n",
      "    return request.execute()\n",
      "  File \"/home/ubuntu/.local/lib/python3.6/site-packages/googleapiclient/_helpers.py\", line 134, in positional_wrapper\n",
      "    return wrapped(*args, **kwargs)\n",
      "  File \"/home/ubuntu/.local/lib/python3.6/site-packages/googleapiclient/http.py\", line 898, in execute\n",
      "    raise HttpError(resp, content, uri=self.uri)\n",
      "googleapiclient.errors.HttpError: <HttpError 403 when requesting https://tpu.googleapis.com/v1/projects/None/locations/None/nodes/10.76.157.10:8470?alt=json returned \"Permission denied on resource project None.\". Details: \"[{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Google developer console API key', 'url': 'https://console.developers.google.com/project/None/apiui/credential'}]}]\">\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_core/python/tpu/preempted_hook.py\", line 87, in run\n",
      "    response = self._cluster._fetch_cloud_tpu_metadata()  # pylint: disable=protected-access\n",
      "  File \"/home/ubuntu/.local/lib/python3.6/site-packages/tensorflow_core/python/distribute/cluster_resolver/tpu_cluster_resolver.py\", line 480, in _fetch_cloud_tpu_metadata\n",
      "    \"constructor. Exception: %s\" % (self._tpu, e))\n",
      "ValueError: Could not lookup TPU metadata from name 'b'10.76.157.10:8470''. Please doublecheck the tpu argument in the TPUClusterResolver constructor. Exception: <HttpError 403 when requesting https://tpu.googleapis.com/v1/projects/None/locations/None/nodes/10.76.157.10:8470?alt=json returned \"Permission denied on resource project None.\". Details: \"[{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Google developer console API key', 'url': 'https://console.developers.google.com/project/None/apiui/credential'}]}]\">\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Done with copy master to slices.\n",
      "INFO:tensorflow:Saving checkpoints for 0 into gs://mesolitica-general/t5-base/model.ckpt.\n",
      "INFO:tensorflow:Before Save.\n",
      "INFO:tensorflow:About to write a checkpoint\n",
      "INFO:tensorflow:gs://mesolitica-general/t5-base/model.ckpt-0 is not in all_model_checkpoint_paths. Manually adding it.\n",
      "INFO:tensorflow:Done writing checkpoint.\n",
      "INFO:tensorflow:Enqueue next (100) batch(es) of data to infeed.\n",
      "INFO:tensorflow:Dequeue next (100) batch(es) of data from outfeed.\n",
      "INFO:tensorflow:Outfeed finished for iteration (0, 0)\n",
      "INFO:tensorflow:Outfeed finished for iteration (0, 46)\n",
      "INFO:tensorflow:Outfeed finished for iteration (0, 92)\n",
      "INFO:tensorflow:loss = 0.6796875, step = 100\n",
      "INFO:tensorflow:Stop infeed thread controller\n",
      "INFO:tensorflow:Shutting down InfeedController thread.\n",
      "INFO:tensorflow:InfeedController received shutdown signal, stopping.\n",
      "INFO:tensorflow:Infeed thread finished, shutting down.\n",
      "INFO:tensorflow:infeed marked as finished\n",
      "INFO:tensorflow:Stop output thread controller\n",
      "INFO:tensorflow:Shutting down OutfeedController thread.\n",
      "INFO:tensorflow:OutfeedController received shutdown signal, stopping.\n",
      "INFO:tensorflow:Outfeed thread finished, shutting down.\n",
      "INFO:tensorflow:outfeed marked as finished\n",
      "INFO:tensorflow:Shutdown TPU system.\n",
      "INFO:tensorflow:Saving checkpoints for 100 into gs://mesolitica-general/t5-base/model.ckpt.\n",
      "INFO:tensorflow:Before Save.\n",
      "INFO:tensorflow:About to write a checkpoint\n",
      "INFO:tensorflow:gs://mesolitica-general/t5-base/model.ckpt-100 is not in all_model_checkpoint_paths. Manually adding it.\n",
      "INFO:tensorflow:Done writing checkpoint.\n",
      "INFO:tensorflow:Done with the session.\n",
      "INFO:tensorflow:Loss for final step: 0.6796875.\n",
      "INFO:tensorflow:training_loop marked as finished\n"
     ]
    }
   ],
   "source": [
    "# Run 100 training steps on the mixture/task named 'trivia_all'.\n",
    "# `model` is created in an earlier cell; 'trivia_all' is presumably registered earlier too -- verify.\n",
    "# The recorded log shows checkpoints written at steps 0 and 100 under gs://mesolitica-general/t5-base/.\n",
    "# NOTE(review): the recorded stderr shows a background thread raising ValueError (HTTP 403) while\n",
    "# looking up TPU metadata; the log still reports the training loop finishing normally.\n",
    "model.train(mixture_or_task_name='trivia_all', steps=100)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
